User comparison tests

Preparation



In [ ]:

    
%run "../Functions/1. Google form analysis.ipynb"
%run "../Functions/4. User comparison.ipynb"

Data vectors of users



In [ ]:

    
#getAllResponders()



In [ ]:

    
# Run setAnswerTemporalities on the survey frame `gform`.
# Neither name is defined in this notebook; both are expected to come from the
# notebooks loaded with %run above.
# NOTE(review): presumably this tags each answer as given before/after playing - confirm in the helper notebook.
setAnswerTemporalities(gform)

getAllUserVectorData



In [ ]:

    
# Build `allData` by calling getAllUserVectorData on one set of user ids.
# Only one of the alternatives below is active; the others are kept commented
# out so the population can be switched quickly.

# small sample
#allData = getAllUserVectorData( getAllUsers( rmdf1522 )[:10] )

# complete set
#allData = getAllUserVectorData( getAllUsers( rmdf1522 ) )

# subjects who answered the gform (active choice)
allData = getAllUserVectorData( getAllResponders() )

# 10 subjects who answered the gform
#allData = getAllUserVectorData( getAllResponders()[:10] )



In [ ]:

    
# 'efficiency' row of allData, sorted ascending and re-labelled 0..n-1 so that
# the x axis of the plot is the rank of each value.
efficiencies = allData.loc['efficiency'].sort_values().reset_index(drop=True)
efficiencies.plot(title='efficiency')



In [ ]:

    
# Same curve on a log scale: zero values are removed first because log(0) is
# -inf, then the remaining values are re-labelled 0..n-1 by rank.
efficiencies2 = allData.loc['efficiency'].sort_values()
efficiencies2 = efficiencies2[efficiencies2 != 0].reset_index(drop=True)
efficiencies2 = np.log(efficiencies2)
efficiencies2.plot(title='efficiency log')



In [ ]:

    
# 'maxChapter' row of allData, sorted ascending and re-labelled 0..n-1 so that
# the x axis of the plot is the rank of each value.
maxChapter = allData.loc['maxChapter'].sort_values().reset_index(drop=True)
maxChapter.plot(title='maxChapter')



In [ ]:

    
# Number of columns in allData.
# NOTE(review): presumably one column per user - confirm against getAllUserVectorData.
len(allData.columns)



In [ ]:

    
# Inputs of the inlined getAllUserVectorData loop in the following cell:
# the ids to process and the correction source forwarded to getUserDataVector.
userIds = getAllResponders()
_source = correctAnswers



In [ ]:

    
# Inlined, step-by-step version of:
#def getAllUserVectorData( userIds, _source = [] ):
# _source is used as correction source, if we want to include answers to these questions
#
# One data vector is computed per user id and all vectors are joined side by
# side (axis=1). The vectors are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies the whole accumulated frame on
# every iteration, which is quadratic in the number of users.

f = FloatProgress(min=0, max=len(userIds))
display(f)

userVectors = []
for userId in userIds:
    #print(str(userId))
    userVectors.append(getUserDataVector(userId, _source = _source))
    # advance the progress bar once the user has actually been processed
    f.value += 1

# keep the original "empty list" result when there is no user at all
allData = pd.concat(userVectors, axis=1) if userVectors else []

#print('done')
allData



In [ ]:

    
# Last value bound to the loop variable `userId` by the user-vector loop above;
# handy to see which user was being processed if that loop stopped on an error.
userId

Correlation Matrix



In [ ]:

    
# Correlation methods accepted below (passed to DataFrame.corr).
methods = ['pearson', 'kendall', 'spearman']

# Parameters of the inlined plotting routine; they mirror the signature kept
# as a comment underneath.
_allUserVectorData = allData.T
_method = methods[0]
_title = 'RedMetrics Correlations'
_abs = True
_clustered = False
_figsize = (20, 20)


#def plotAllUserVectorDataCorrelationMatrix(
#    _allUserVectorData,
#    _method = methods[0], 
#    _title='RedMetrics Correlations', 
#    _abs=False,
#    _clustered=False, 
#    _figsize = (20,20)
#):

# three steps are tracked: correlation, optional abs(), plot
_progress = FloatProgress(min=0, max=3)
display(_progress)

# computation of correlation matrix; unknown method names fall back to the first one
_m = _method if _method in methods else methods[0]
_correlation = _allUserVectorData.astype(float).corr(_m)
_progress.value += 1
if _abs:
    # only the strength of each correlation is kept, not its sign
    _correlation = _correlation.abs()
_progress.value += 1

# plot: plain heatmap by default, clustered map on request
if not _clustered:
    _fig = plt.figure(figsize=_figsize)
    _ax = plt.subplot(111)
    _ax.set_title(_title)
    sns.heatmap(_correlation, ax=_ax, cmap=plt.cm.jet, square=True)
else:
    sns.clustermap(_correlation, cmap=plt.cm.jet, square=True, figsize=_figsize)
_progress.value += 1



In [ ]:



In [ ]:



In [ ]:

    
# Distinct values found in gform[QTemporality].
gform[QTemporality].unique()



In [ ]:

    
# Non-missing entries of the 'scoreundefined' row of allData.
allData.loc['scoreundefined'].dropna()



In [ ]:

    
# First 10 entries returned by getAllUsers for rmdf1522.
getAllUsers(rmdf1522)[:10]



In [ ]:

    
# Number of entries returned by getAllUsers for rmdf1522.
len(getAllUsers(rmdf1522))

List of users and their sessions



In [ ]:

    
# One row per 'start' event, keeping only the player guid and the session id.
userSessionsRelevantColumns = ['customData.localplayerguid', 'sessionId']
userSessions = rmdf1522.loc[rmdf1522['type'] == 'start', userSessionsRelevantColumns]



In [ ]:

    
# Rename the 'customData.localplayerguid' column to 'userId';
# index=str additionally converts the row labels to strings.
userSessions = userSessions.rename(index=str, columns={'customData.localplayerguid': 'userId'})
userSessions.head()



In [ ]:

    
#groupedUserSessions = userSessions.groupby('customData.localplayerguid')
#groupedUserSessions.head()
#groupedUserSessions.describe().head()

List of sessions with their checkpoint achievements



In [ ]:

    
# 'reach' events whose section name starts with 'tutorial'; only the section,
# the session id and the time of the event are kept.
checkpointsRelevantColumns = ['sessionId', 'customData.localplayerguid', 'type', 'section', 'userTime']
checkpoints = rmdf1522.loc[rmdf1522['type'] == 'reach', checkpointsRelevantColumns]

# na=False: rows without a section name are treated as non-matching
checkpoints = checkpoints.loc[
    checkpoints['section'].str.startswith('tutorial', na=False),
    ['section', 'sessionId', 'userTime'],
]
#checkpoints = checkpoints.groupby("sessionId")
#checkpoints = checkpoints.max()
checkpoints.head()

Assembly of both



In [ ]:

    
# Outer join of sessions and checkpoints on 'sessionId': sessions without any
# checkpoint and checkpoints without a known session are both kept.
#assembled = userSessions.combine_first(checkpoints)
assembled = userSessions.merge(checkpoints, how='outer', on='sessionId')
assembled.head()



In [ ]:

    
# The session id was only needed for the join; drop that column.
# The axis is passed by keyword: the positional form drop('sessionId', 1) was
# deprecated in pandas 1.3 and is rejected by pandas 2.x.
userSections = assembled.drop('sessionId', axis=1)
userSections.head()



In [ ]:

    
# Remove every row that has at least one missing value (DataFrame.dropna default).
userSections = userSections.dropna()
userSections.head()



In [ ]:

    
# Per-user maximum of every remaining column of userSections.
# Note that this rebinds `checkpoints`, which previously held the per-event table.
checkpoints = userSections.groupby("userId").max()
checkpoints.head()

Time analysis



In [ ]:

    
#userTimedSections = userSections.groupby("userId").agg({ "userTime": np.min })
#userTimedSections = userSections.groupby("userId")

# First and last 'userTime' of every user.
# The aggregations are requested by name and then explicitly renamed to
# 'amin'/'amax': when np.min/np.max are passed as callables the resulting column
# labels are derived from the functions' __name__, which is not the same across
# numpy/pandas versions ('amin' vs 'min'), so the lookups below could fail.
userTimes = (
    userSections
    .groupby("userId")
    .agg({"userTime": ["min", "max"]})
    .rename(columns={"min": "amin", "max": "amax"}, level=1)
)

# Elapsed time between the first and the last event of each user ...
userTimes["duration"] = pd.to_datetime(userTimes["userTime"]["amax"]) - pd.to_datetime(userTimes["userTime"]["amin"])
# ... expressed with a resolution of one second
userTimes["duration"] = userTimes["duration"].map(lambda x: np.timedelta64(x, 's'))
# longest durations first
userTimes = userTimes.sort_values(by=['duration'], ascending=[False])
userTimes.head()

TODO

```python
userTimes.loc[:,'duration']
userTimes = userTimes[4:]
userTimes["duration_seconds"] = userTimes["duration"].map(lambda x: pd.Timedelta(x).seconds)
maxDuration = np.max(userTimes["duration_seconds"])
userTimes["duration_rank"] = userTimes["duration_seconds"].rank(ascending=False)
userTimes.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
plt.legend('')
plt.xlim(0, 139)
plt.ylim(0, maxDuration)
```

userTimedSections = userSections.groupby("section").agg({ "userTime": np.min }) userTimedSections

userTimedSections["firstReached"] = pd.to_datetime(userTimedSections["userTime"]) userTimedSections.head()

userTimedSections.drop('userTime', 1) userTimedSections.head()

userTimedSections["firstCompletionDuration"] = userTimedSections["firstReached"].diff() userTimedSections.head()



In [ ]:

    
# Inlined, step-by-step version of:
#def getAllUserVectorDataCustom(before, after, gfMode = False, rmMode = True, sessionCount = 1, _rmDF = rmdf1522)
# The assignments below play the role of the arguments.
sessionCount = 1
_rmDF = rmdf1522
sample = gform
before = False
after = True
gfMode = False
rmMode = True

# Pick the surveys matching the requested temporality (before / after / both).
# NOTE(review): the helper names are not symmetrical (getGFBefores vs
# getGFormAfters) - confirm both exist in the function notebooks.
userIds = []
if before and after:
    userIds = getSurveysOfUsersWhoAnsweredBoth(sample, gfMode = gfMode, rmMode = rmMode)
elif before:
    userIds = getRMBefores(sample) if rmMode else getGFBefores(sample)
elif after:
    userIds = getRMAfters(sample) if rmMode else getGFormAfters(sample)

if len(userIds) == 0:
    print("no matching user")
    result = []
else:
    # keep only the user id column of the selected surveys
    userIds = userIds[localplayerguidkey]
    # transpose so that 'sessionsCount' can be used as a column to filter on,
    # then transpose the selection back
    allUserVectorData = getAllUserVectorData(userIds, _rmDF = _rmDF).T
    result = allUserVectorData[allUserVectorData['sessionsCount'] == sessionCount].T



In [ ]:

    
# Display the `result` computed by the previous cell.
result



In [ ]:

    
# Same selection through the helper function.
# NOTE(review): getAllUserVectorDataCustom is not defined in this notebook; presumably it is
# loaded by the %run'd function notebooks and takes (before, after) first - confirm.
getAllUserVectorDataCustom(False, True)



In [ ]:

    
# Data vectors of the users returned by getSurveysOfUsersWhoAnsweredBoth,
# transposed so that 'sessionsCount' is a column, restricted to the rows whose
# sessionsCount equals 1.
userIdsBoth = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = True)[localplayerguidkey]
allUserVectorData = getAllUserVectorData(userIdsBoth).T
allUserVectorData[allUserVectorData['sessionsCount'] == 1]

user progress classification

tinkering



In [ ]:

    
# Hard-coded id kept for manual experiments; it is not referenced by any other
# cell visible in this notebook.
# NOTE(review): looks like a real player GUID - confirm it may stay in a shared notebook.
testUser = "3685a015-fa97-4457-ad73-da1c50210fe1"



In [ ]:

    
def getScoreFromBinarized(binarizedAnswers):
    """Sum every row of a binarized-answers table into one score per row.

    binarizedAnswers -- 2-D table whose row labels are strings containing
        `correctionsColumnNameStem` followed by an integer; that integer is
        used as a row label into the global `gform`.

    Returns a pandas Series holding the sum of each row, indexed by the
    `localplayerguidkey` value of the corresponding `gform` rows.
    """
    # the part of each row label after the stem is the gform row it came from
    gformIndices = binarizedAnswers.index.map(
        lambda label: int(label.split(correctionsColumnNameStem)[1])
    )
    # multiplying by a vector of ones adds up the columns of each row
    columnWeights = np.ones(binarizedAnswers.shape[1])
    rowScores = np.dot(binarizedAnswers, columnWeights)
    responderIds = gform.loc[gformIndices, localplayerguidkey]
    return pd.Series(rowScores, index=responderIds)



In [ ]:

    
#allResponders = getAllResponders()

# Surveys returned by getSurveysOfUsersWhoAnsweredBoth for gform; only the
# rmMode=True variant is active, the other flag combinations are kept commented out.
#gf_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = False)
rm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = False, rmMode = True)
#gfrm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = True)

# Binarized answers of the 'before' and of the 'after' surveys of that group.
sciBinarizedBefore = getAllBinarized(_form = getRMBefores(rm_both))
sciBinarizedAfter = getAllBinarized(_form = getRMAfters(rm_both))

# One score per survey (row sum of the binarized table).
scoresBefore = getScoreFromBinarized(sciBinarizedBefore)
scoresAfter = getScoreFromBinarized(sciBinarizedAfter)



In [ ]:

    
# Median score before and after.
medianBefore = np.median(scoresBefore)
medianAfter = np.median(scoresAfter)
# Number of columns of the binarized table; since a score is the sum of one row
# of that table, this is used as the highest reachable score.
maxScore = sciBinarizedBefore.shape[1]



In [ ]:

    
# Per-user progress indicators derived from the before/after scores.
# Zero denominators are replaced by 1 with a vectorised Series.replace instead
# of a Python loop doing one .loc lookup per row: same values, no per-row
# overhead, and no failure when an index label occurs more than once.
indicators = pd.DataFrame()
indicators[answerTemporalities[0]] = scoresBefore
indicators[answerTemporalities[1]] = scoresAfter

# raw progress, and the progress that was still possible given the initial score
indicators['delta'] = scoresAfter - scoresBefore
# a user who already had the maximum score gets a denominator of 1 instead of 0
indicators['maxPotentialDelta'] = (maxScore - scoresBefore).replace(0, 1)

# every "relative" indicator is expressed in units of the median 'before' score
indicators['relativeBefore'] = scoresBefore / medianBefore
indicators['relativeAfter'] = scoresAfter / medianBefore
indicators['relativeDelta'] = indicators['delta'] / medianBefore
indicators['realizedPotential'] = indicators['delta'] / indicators['maxPotentialDelta']
# progress relative to the user's own initial score (an initial score of 0 counts as 1)
indicators['increaseRatio'] = indicators['delta'] / indicators[answerTemporalities[0]].replace(0, 1)



In [ ]:

    
# Display the indicators table.
indicators



In [ ]:

    
# Summary of the relative indicators, displayed as one tuple:
#   (min, max) of relativeBefore, (min, max) of relativeDelta, medianBefore,
#   median of relativeBefore, median of relativeDelta.
# Written as a parenthesised tuple: the previous form relied on backslash
# continuations, and its last line ended with a dangling backslash.
(
    (min(indicators['relativeBefore']), max(indicators['relativeBefore'])),
    (min(indicators['relativeDelta']), max(indicators['relativeDelta'])),
    medianBefore,
    np.median(indicators['relativeBefore']),
    np.median(indicators['relativeDelta']),
)



In [ ]:

    
# Names of two indicator columns.
# NOTE(review): scatterPlotIndicators takes parameters with these same names, which shadow
# these globals inside the function, and no visible cell reads them - possibly leftovers.
indicatorX = 'relativeBefore'
indicatorY = 'relativeDelta'

def scatterPlotIndicators(indicatorX, indicatorY):
    """Scatter-plot two columns of the global `indicators` table with median lines.

    Prints the (min, max) range and the median of both columns, then draws the
    scatter plot together with two black lines: a vertical one at the median of
    `indicatorX` and a horizontal one at the median of `indicatorY`.

    indicatorX -- name of the `indicators` column shown on the x axis
    indicatorY -- name of the `indicators` column shown on the y axis
    """
    xValues = indicators[indicatorX]
    yValues = indicators[indicatorY]

    # every statistic is computed once and reused for the report and the plot
    xRange = (min(xValues), max(xValues))
    yRange = (min(yValues), max(yValues))
    xMedian = np.median(xValues)
    yMedian = np.median(yValues)

    print(indicatorX + ' range: ' + str(xRange))
    print(indicatorY + ' range: ' + str(yRange))
    print(indicatorX + ' median: ' + str(xMedian))
    print(indicatorY + ' median: ' + str(yMedian))

    fig, ax1 = plt.subplots()
    ax1.scatter(xValues, yValues)
    ax1.set_xlabel(indicatorX)
    ax1.set_ylabel(indicatorY)

    # vertical line at the x median, spanning the whole y range
    ax1.plot([xMedian, xMedian], list(yRange), 'k-', lw=2)

    # horizontal line at the y median, spanning the whole x range
    ax1.plot(list(xRange), [yMedian, yMedian], 'k-', lw=2)



In [ ]:

    
# Column names of the indicators table (valid arguments for scatterPlotIndicators).
indicators.columns



In [ ]:

    
# x: 'relativeBefore' column, y: 'relativeDelta' column of indicators.
scatterPlotIndicators('relativeBefore', 'relativeDelta')



In [ ]:

    
# x: 'relativeBefore' column, y: 'realizedPotential' column of indicators.
scatterPlotIndicators('relativeBefore', 'realizedPotential')



In [ ]:

    
# x: 'relativeBefore' column, y: 'increaseRatio' column of indicators.
scatterPlotIndicators('relativeBefore', 'increaseRatio')



In [ ]:

    
# x: 'relativeBefore' column, y: 'relativeAfter' column of indicators.
scatterPlotIndicators('relativeBefore', 'relativeAfter')



In [ ]:

    
# x: 'maxPotentialDelta' column, y: 'realizedPotential' column of indicators.
scatterPlotIndicators('maxPotentialDelta', 'realizedPotential')



In [ ]: